/********************************************************************************
Discrimination in Multi-Phase Systems: Evidence from Child Protection

Created on: 12/28/2022
Last Modified on: 2/13/2024

Description: This program generates the main sample restrictions for the screener 
analysis sample.

Note that we have removed the file directory names from this program for 
confidentiality reasons.
********************************************************************************/

**************************
**(0) SETUP
**************************
*Start from a clean session: no data in memory, no output paging, no global
*macros left over from an earlier run, and no open log file.
clear
set more off
*The keyword _all removes every user-defined global macro. The bare word "all"
*would only target a global macro literally named all, so stale globals (for
*example directory paths from a previous session) would survive.
macro drop _all
capture log close
*Fix the random-number seed.
set seed 02042023

*Set directories
*The paths are intentionally blank (removed for confidentiality). File names are
*appended directly to these globals below, so each path must end with a
*directory separator when filled in.
global cleandata 
global tmpdata 
global rawdata 
global output 

*************************
**(1) CLEAN RAW FILE OF SCREENED-IN CALLS, WITHOUT SAMPLE RESTRICTIONS YET
*************************
clear
import delimited "${rawdata}screened_in_calls.csv"

**Restrict to rows whose child role is alleged victim; every row in this file
**is flagged as screened in
keep if child_role == "Alleged Victim (AV)"
generate screened = 1
rename childzip5frmip zipcode_vic

**Generate covariates
*Allegation type: row-level 0/1 flags, one per allegation category.
*The literal string "NA" is treated as a missing allegation type.
generate tmp_phyab        = (allegationtype == "Physical Abuse")
generate tmp_phyneg       = (allegationtype == "Physical Neglect")
generate tmp_impsup       = (allegationtype == "Improper Supervision")
generate tmp_threat       = (allegationtype == "Threatened Harm")
generate tmp_sexab        = (allegationtype == "Sexual Abuse")
generate tmp_medneg       = (allegationtype == "Medical Neglect")
generate tmp_failprot     = (allegationtype == "Failure To Protect")
generate tmp_maltreatment = (allegationtype == "Maltreatment")
generate tmp_miss_alleg   = (allegationtype == "NA")

*Lift each flag to the child-by-intake level: the flag equals 1 if any row of
*that child-intake pair has the allegation type. The row-level helper is
*dropped once its group-level version exists.
foreach alleg in phyab phyneg impsup threat sexab medneg failprot maltreatment miss_alleg {
	gegen `alleg' = max(tmp_`alleg'), by(childpartyid intake_id)
	drop tmp_`alleg'
}

*Reporter role
*Print the reporter categories, then build one row-level 0/1 flag per category.
*The literal string "NA" is treated as a missing reporter category.
tabulate reporter_category_desc
generate tmp_edu           = (reporter_cat == "School")
generate tmp_law           = (reporter_cat == "Law Enforcement")
generate tmp_family        = (reporter_cat == "Family")
generate tmp_medical       = (reporter_cat == "Medical")
generate tmp_other         = (reporter_cat == "Other")
generate tmp_counselor     = (reporter_cat == "Counselor/Therapist")
generate tmp_miss_reporter = (reporter_category_desc == "NA")
generate tmp_bcal          = (reporter_category_desc == "BCAL")
generate tmp_provider      = (reporter_category_desc == "Provider")
generate tmp_court         = (reporter_category_desc == "Court")
generate tmp_mdhhs         = (reporter_category_desc == "MDHHS")
generate tmp_birthmatch    = (reporter_category_desc == "Birth Match")
generate tmp_clergy        = (reporter_category_desc == "Clergy")

*Lift each flag to the child-by-intake level (1 if any row of the pair has the
*reporter category) and discard the row-level helper.
foreach rep in edu law family medical other counselor court mdhhs clergy birthmatch miss_reporter provider bcal {
	gegen `rep' = max(tmp_`rep'), by(childpartyid intake_id)
	drop tmp_`rep'
}

*Alleged perpetrator type
*Row-level 0/1 flags grouping the perpetrator-to-victim relationship into
*father figures, mother figures, other relatives, and unrelated persons.
generate tmp_dad    = inlist(relationtypeperptovictim, "Biological Father", "Legal Father", "Stepfather")
generate tmp_mom    = inlist(relationtypeperptovictim, "Biological Mother", "Legal Mother", "Stepmother")
generate tmp_rel    = inlist(relationtypeperptovictim, "Grand Mother", "Grand Father", "Uncle", "Aunt", "Cousin")
generate tmp_notrel = (relationtypeperptovictim == "Not Related")

*Lift each flag to the child-by-intake level and discard the row-level helper.
foreach perp in dad mom rel notrel {
	gegen `perp' = max(tmp_`perp'), by(childpartyid intake_id)
	drop tmp_`perp'
}

*Keep a single row per child-by-intake pair; the flags above already summarise
*all rows of the pair.
gduplicates drop childpartyid intake_id, force

**Child's race
*Remove any pre-existing race indicators (ignored if absent), then rebuild them
*from the race description.
capture drop whi bla
generate white = (racedesc == "White")
generate black = (racedesc == "Black/African American")

**Keep only relevant variables
*whi and bla rely on Stata's variable-name abbreviation to select the white and
*black indicators created just above.
keep intake_id childpartyid complaint_date complaint_dttm ///
	scrnr_last_nm scrnr_first_nm supv_last_nm zipcode_vic ///
	screeningdecision whi bla child_age child_sex county screened ///
	phyab phyneg impsup threat sexab medneg failprot maltreatment miss_alleg ///
	dad mom rel notrel ///
	edu law family medical other counselor court mdhhs clergy birthmatch miss_reporter provider bcal

**Save this temporary dataset
save "${tmpdata}screened_in_calls_qje.dta", replace

*************************
**(2) CLEAN RAW FILE OF SCREENED-OUT CALLS, WITHOUT SAMPLE RESTRICTIONS YET
*************************
clear
import delimited "${rawdata}screened_out_calls.csv"

**Generate "screened" variable
*Rows in this file are screened out (0), except the few whose screening
*decision is "Accept and Link", which count as screened in (1).
generate screened = (screeningdecision == "Accept and Link")

**Keep only relevant variables
keep intake_id childpartyid scrnr_last_nm scrnr_first_nm ///
	complaint_date complaint_dttm racedesc child_sex child_age county ///
	screeningdecision allegationtype reporter_cat screened

**Generate race variable
*NOTE(review): the Black category is matched on "Black" here, whereas the
*screened-in section matches "Black/African American" - confirm against the
*tabulation below that the two raw files really use different labels.
tabulate racedesc
generate white = (racedesc == "White")
generate black = (racedesc == "Black")

**Generate covariates
*Allegation type: row-level 0/1 flags, one per allegation category.
*The literal string "NA" is treated as a missing allegation type.
tabulate allegationtype, missing
generate tmp_phyab        = (allegationtype == "Physical Abuse")
generate tmp_phyneg       = (allegationtype == "Physical Neglect")
generate tmp_impsup       = (allegationtype == "Improper Supervision")
generate tmp_threat       = (allegationtype == "Threatened Harm")
generate tmp_sexab        = (allegationtype == "Sexual Abuse")
generate tmp_medneg       = (allegationtype == "Medical Neglect")
generate tmp_failprot     = (allegationtype == "Failure To Protect")
generate tmp_maltreatment = (allegationtype == "Maltreatment")
generate tmp_miss_alleg   = (allegationtype == "NA")

*Lift each flag to the child-by-intake level (1 if any row of the pair has the
*allegation type) and discard the row-level helper.
foreach alleg in phyab phyneg impsup threat sexab miss_alleg medneg failprot maltreatment {
	gegen `alleg' = max(tmp_`alleg'), by(childpartyid intake_id)
	drop tmp_`alleg'
}

*Reporter role
*Print the reporter categories, then build one row-level 0/1 flag per category.
*The literal string "NA" is treated as a missing reporter category.
tabulate reporter_category_desc
generate tmp_edu           = (reporter_cat == "School")
generate tmp_law           = (reporter_cat == "Law Enforcement")
generate tmp_family        = (reporter_cat == "Family")
generate tmp_medical       = (reporter_cat == "Medical")
generate tmp_other         = (reporter_cat == "Other")
generate tmp_counselor     = (reporter_cat == "Counselor/Therapist")
generate tmp_miss_reporter = (reporter_category_desc == "NA")
generate tmp_bcal          = (reporter_category_desc == "BCAL")
generate tmp_provider      = (reporter_category_desc == "Provider")
generate tmp_court         = (reporter_category_desc == "Court")
generate tmp_mdhhs         = (reporter_category_desc == "MDHHS")
generate tmp_birthmatch    = (reporter_category_desc == "Birth Match")
generate tmp_clergy        = (reporter_category_desc == "Clergy")

*Lift each flag to the child-by-intake level (1 if any row of the pair has the
*reporter category) and discard the row-level helper.
foreach rep in edu law family medical other counselor court mdhhs clergy birthmatch miss_reporter provider bcal {
	gegen `rep' = max(tmp_`rep'), by(childpartyid intake_id)
	drop tmp_`rep'
}


**Drop duplicates
*One row per child-by-intake pair is kept.
gduplicates drop childpartyid intake_id, force

**Save this temporary dataset
save "${tmpdata}screened_out_calls_qje.dta", replace


*************************
**(3) GENERATE UNIVERSE OF HOTLINE CALLS, WITHOUT SAMPLE RESTRICTIONS YET
*************************
**Stack the two cleaned files: screened-out calls first, screened-in calls appended
use "${tmpdata}screened_out_calls_qje.dta", clear
append using "${tmpdata}screened_in_calls_qje.dta"
tabulate screened

**Drop small number of observations with invalid child ids (0 and 99)
drop if inlist(childpartyid, 0, 99)

**Generate unique screener ids
*Normalise both name parts before grouping: strip spaces, commas, periods and
*hyphens, then lower-case, so formatting variants of a name share one id.
foreach namevar in scrnr_last_nm scrnr_first_nm {
	replace `namevar' = lower(subinstr(subinstr(subinstr(subinstr(`namevar', " ", "", .), ",", "", .), ".", "", .), "-", "", .))
}

*One integer id per distinct (first name, last name) combination
egen screener = group(scrnr_first_nm scrnr_last_nm)
summarize screener

**Keep only variables of interest
*whi and bla rely on Stata's variable-name abbreviation to select the white and
*black indicators.
keep childpartyid intake_id complaint_date complaint_dttm screened whi bla ///
	child_age child_sex county zipcode_vic ///
	screener scrnr_first_nm scrnr_last_nm ///
	phyab phyneg impsup threat sexab miss_alleg medneg failprot maltreatment ///
	court mdhhs clergy birthmatch miss_reporter provider bcal ///
	edu law family medical other counselor

**Generate primary outcome: subject of another investigation within six months 
*Make the date variable uniform: for screened-out rows only the first 10
*characters of complaint_date are kept
gen cw_date = complaint_date
replace cw_date = substr(complaint_date,1,10) if screened==0
drop complaint_date

*Gen a stata date variable from the year-month-day string
gen cw_date_stata = date(cw_date,"YMD")
order cw_date_stata 

*Second parse, on the first 10 characters of every row; it fills in any date
*the first parse could not read
gen cw_date2 = substr(cw_date,1,10) 
order cw_date2

gen cw_date_stata2 = date(cw_date2,"YMD")
replace cw_date_stata = cw_date_stata2 if cw_date_stata==.

*For a given focal child X call observation, was the child the subject of
*another *investigation* (a screened-in call) dated 1 to 180 days later?
*
*The earlier version looked only at the row immediately after the focal call.
*That misses a qualifying investigation whenever the next row is a same-day
*call or a screened-out call and the investigation comes one or more rows
*later. It also relied on the physical row order for screened[_n+1] outside any
*by-group. Here the earliest investigation dated strictly after each call is
*found explicitly.
*NOTE(review): this changes inv6m for the affected rows - confirm that
*"any investigation within 180 days" is the intended definition.
gen inv6m = 0

order childpartyid cw_date screened inv*

*Work through each child's calls from latest to earliest so that information
*about later calls can be carried back. Rows with a missing date sort last.
gen long tmp_negdate = -cw_date_stata
gen long tmp_invdate = cw_date_stata if screened==1

*Step 1: running earliest investigation date among the rows processed so far
*(dates are descending, so the most recently seen investigation is the earliest)
bysort childpartyid (tmp_negdate): replace tmp_invdate = tmp_invdate[_n-1] if missing(tmp_invdate) & _n>1

*Step 2: earliest investigation date strictly after the focal date. The first
*row of each date takes the running value from the preceding row, which covers
*all later-dated calls; remaining same-day rows copy it. by (not bysort) is
*used so the row order from step 1 is never reshuffled.
by childpartyid (tmp_negdate): gen long tmp_nextinv = tmp_invdate[_n-1] if _n>1 & cw_date_stata[_n-1] > cw_date_stata
by childpartyid (tmp_negdate): replace tmp_nextinv = tmp_nextinv[_n-1] if _n>1 & cw_date_stata[_n-1] == cw_date_stata

*Step 3: flag calls followed by an investigation 1 to 180 days later. Rows with
*a missing date are excluded explicitly because inrange() treats a missing
*bound as unbounded.
replace inv6m = 1 if inrange(tmp_nextinv, cw_date_stata+1, cw_date_stata+180) & !missing(cw_date_stata)

*Remove the helpers and leave the data sorted by child and call date
drop tmp_negdate tmp_invdate tmp_nextinv
gsort childpartyid cw_date_stata 

label var inv6m "Subject of another investigation within 6 months"


**Generate additional covariate: child had a previous investigation / number of previous investigations 
cap drop chron_order chron prev_contact 

*Chronological position of each call within child. The within-group sort key is
*stated explicitly instead of relying on a preceding sort surviving the by
*prefix. Calls sharing a date are ordered arbitrarily.
bysort childpartyid (cw_date_stata2): gen chron = _n 

*Running count of investigations (screened-in calls) up to AND INCLUDING the
*current call
bysort childpartyid (chron): gen cum_inv = sum(screened)
order cum_inv

*Number of investigations strictly before the current call: remove the current
*call's own contribution to the running count. Using cum_inv directly counted
*the current call for screened-in rows, and prev_inv = cum_inv>1 then required
*two earlier investigations for screened-out rows. For screened-in rows
*prev_inv is unchanged by this correction.
*NOTE(review): an investigation on the same date may or may not be counted as
*previous, depending on the arbitrary order of same-day calls.
gen num_prev_inv = cum_inv - screened
gen prev_inv = num_prev_inv>0 
sum prev_inv num_prev_inv  
drop chron 

**Save this temporary dataset 
save "${tmpdata}all_hotline_calls_qje.dta", replace 

*************************
**(4) GENERATE MAIN SAMPLE RESTRICTIONS 
*************************
use "${tmpdata}all_hotline_calls_qje.dta", clear

**Restrict to calls involving white or Black children
keep if white==1 | black==1

**Restrict the calendar window
*Year of the call, tabulated before restricting to 2019 or earlier.
*NOTE(review): only an upper bound on the year is applied in this section;
*confirm the raw files contain no calls before 2017.
generate cps_year = year(cw_date_stata)
tabulate cps_year
keep if cps_year <= 2019

*Keep calls dated on or before June 30, 2019, so that the following six months
*of outcomes can be observed (td(30jun2019) equals the date value 21730)
keep if cw_date_stata <= td(30jun2019)

**Merge in foster care placement data and worker ids
*The using file contains a removal flag and worker ids for each investigator,
*obtained from the state. Unmatched rows from the using file are not kept.
merge 1:1 childpartyid intake_id using "${tmpdata}2017_19_removals.dta", keep(1 3)
*Screened-in calls with no match in the removals file are dropped; unmatched
*screened-out calls are retained.
drop if _merge==1 & screened==1
*Rows left without a removal flag are coded as no removal
replace fc = 0 if fc==.

**Drop "automated" screeners
*The last names are compared in their cleaned form (lower case, no spaces or
*punctuation)
drop if inlist(scrnr_last_nm, "mandatedreporter", "batchjob110")

save "${tmpdata}all_hotline_calls_main_restrictions_qje.dta", replace